# Script to clean the ISSP data

# load libraries
library(data.table)
library(tidyverse)
library(haven)
library(dplyr)

data <- ISSP_data
  
# Gender variable
data <- data %>% rename(sex = SEX)
data <- data %>% mutate(woman = ifelse(sex == 2, TRUE, FALSE))
data <- data %>% mutate(woman = ifelse(sex == 9, NA, woman))

# Make country variable
data <- data %>% select(-country)
data <- data %>% rename(country_s = c_alphan)
data <- data %>%  mutate(country_s = ifelse(country_s == "GB-GBN", "GB", country_s))
data <- data %>%  mutate(country_f = as_factor(country_s))
  
# Age and age squared
data <- data %>%  mutate(age = AGE, age2 = AGE^2)
  
# Children variables
data <- data %>% mutate(nchildren = ifelse(HHCHILDR %in% c(96, 99), NA,
                                      ifelse(HHCHILDR > 4, 4, HHCHILDR)))
  
data <- data %>%  mutate(children = ifelse(nchildren > 0, TRUE, FALSE))
  
# Remove if missing data on children  
data <- data %>% filter(!is.na(children))
data <- data %>% filter(between(age, 25, 65))

# Some renaming
data <- data %>% rename(weight = WEIGHT, marst = MARITAL, education = EDUCYRS)
  
# Married indicator
data <- data %>% mutate(married = ifelse(marst == 1, TRUE, FALSE))
data <- data %>% mutate(married = ifelse(marst > 6, NA, married)) # adjust missing values
  
# Remove missing education
data <- data %>% mutate(education = ifelse(education > 25, NA, education))

# Drop extremes in years of education (1% outliers by country)
data <- data %>% 
    group_by(country_f) %>%
    mutate(education = ifelse(between(education, quantile(education, 0.01, na.rm = TRUE), quantile(education, 0.99, na.rm = TRUE)), education, NA)) %>%
    ungroup()

# Hours
data <- data %>% rename(hours = WRKHRS, occupation = ISCO08, status = MAINSTAT, empst = EMPREL)
data <- data %>% mutate(hours = ifelse(hours == 96, 95, ifelse(hours > 96, NA, hours)))
data <- data %>% mutate(hours = ifelse(hours == 0, NA, hours))
data <- data %>% mutate(log_hours = log(hours))
data <- data %>% mutate(level_hours = hours)
data <- data %>% mutate(hours50 = ifelse(hours >= 50, TRUE, FALSE))
  
# Keep only working + not self-employed
data <- data %>% filter(empst == 1 & status == 1)

# Wage variables are separate by country: make unified wage variable
# 1. Convert labelled numeric variables to regular numeric variables
data <- mutate(data,
                 AT_RINC = as.numeric(AT_RINC),
                 AU_RINC = as.numeric(AU_RINC),
                 BE_RINC = as.numeric(BE_RINC),
                 CH_RINC = as.numeric(CH_RINC),
                 CL_RINC = as.numeric(CL_RINC),
                 CN_RINC = as.numeric(CN_RINC),
                 CZ_RINC = as.numeric(CZ_RINC),
                 DE_RINC = as.numeric(DE_RINC),
                 DK_RINC = as.numeric(DK_RINC),
                 EE_RINC = as.numeric(EE_RINC),
                 ES_RINC = as.numeric(ES_RINC),
                 FI_RINC = as.numeric(FI_RINC),
                 FR_RINC = as.numeric(FR_RINC),
                 GB_RINC = as.numeric(GB_RINC),
                 GE_RINC = as.numeric(GE_RINC),
                 HR_RINC = as.numeric(HR_RINC),
                 HU_RINC = as.numeric(HU_RINC),
                 IL_RINC = as.numeric(IL_RINC),
                 IN_RINC = as.numeric(IN_RINC),
                 IS_RINC = as.numeric(IS_RINC),
                 JP_RINC = as.numeric(JP_RINC),
                 LT_RINC = as.numeric(LT_RINC),
                 LV_RINC = as.numeric(LV_RINC),
                 MX_RINC = as.numeric(MX_RINC),
                 NO_RINC = as.numeric(NO_RINC),
                 NZ_RINC = as.numeric(NZ_RINC),
                 PH_RINC = as.numeric(PH_RINC),
                 PL_RINC = as.numeric(PL_RINC),
                 RU_RINC = as.numeric(RU_RINC),
                 SE_RINC = as.numeric(SE_RINC),
                 SI_RINC = as.numeric(SI_RINC),
                 SK_RINC = as.numeric(SK_RINC),
                 SR_RINC = as.numeric(SR_RINC),
                 TW_RINC = as.numeric(TW_RINC),
                 US_RINC = as.numeric(US_RINC),
                 VE_RINC = as.numeric(VE_RINC),
                 ZA_RINC = as.numeric(ZA_RINC)
  )
  
# 2. Create a new unified wage variable
data <- data %>% 
  mutate(
    wage_d = case_when(
        country_s == "AT" ~ AT_RINC,
        country_s == "AU" ~ AU_RINC,
        country_s == "BE" ~ BE_RINC,
        country_s == "CH" ~ CH_RINC,
        country_s == "CL" ~ CL_RINC,
        country_s == "CN" ~ CN_RINC,
        country_s == "CZ" ~ CZ_RINC,
        country_s == "DE" ~ DE_RINC,
        country_s == "DK" ~ DK_RINC,
        country_s == "EE" ~ EE_RINC,
        country_s == "ES" ~ ES_RINC,
        country_s == "FI" ~ FI_RINC,
        country_s == "FR" ~ FR_RINC,
        country_s == "GB" ~ GB_RINC,
        country_s == "GE" ~ GE_RINC,
        country_s == "HR" ~ HR_RINC,
        country_s == "HU" ~ HU_RINC,
        country_s == "IL" ~ IL_RINC,
        country_s == "IN" ~ IN_RINC,
        country_s == "IS" ~ IS_RINC,
        country_s == "JP" ~ JP_RINC,
        country_s == "LT" ~ LT_RINC,
        country_s == "LV" ~ LV_RINC,
        country_s == "MX" ~ MX_RINC,
        country_s == "NO" ~ NO_RINC,
        country_s == "NZ" ~ NZ_RINC,
        country_s == "PH" ~ PH_RINC,
        country_s == "PL" ~ PL_RINC,
        country_s == "RU" ~ RU_RINC,
        country_s == "SE" ~ SE_RINC,
        country_s == "SI" ~ SI_RINC,
        country_s == "SK" ~ SK_RINC,
        country_s == "SR" ~ SR_RINC,
        country_s == "TW" ~ TW_RINC,
        country_s == "US" ~ US_RINC,
        country_s == "VE" ~ VE_RINC,
        country_s == "ZA" ~ ZA_RINC))
  
# Remove if missing wage data
data <- data %>% mutate(wage_d = ifelse(wage_d >= 999990, NA, wage_d))
  
# Remove wage outliers (2.5 and 97.5th percentiles by country)
data <- data %>%
  group_by(country_f) %>%
  mutate(wage_d = ifelse(between(wage_d, quantile(wage_d, 0.025, na.rm = TRUE), quantile(wage_d, 0.975, na.rm = TRUE)), wage_d, NA)) %>%
  ungroup()
  
# Construct Hourly Wages
data <- data %>% mutate(hwage = wage_d / (hours * (52/12)))
  
# Monthly wage quintiles
data <- data %>% group_by(country_f) %>%
  mutate(wage_q = ntile(wage_d, 10))

# Meaning and Flexibility Variables
data <- data %>% rename(meaning = v28, flexibility = v34, telecommute = v32, flex_pref = v10, meaning_pref = v9)
  
data <- data %>%  mutate(meaning = ifelse(meaning %in% c(0, 8, 9), NA, 5 - meaning),
      flexibility = ifelse(flexibility %in% c(0, 8, 9), NA, flexibility),
      telecommute = ifelse(telecommute %in% c(0, 8, 9), NA, 5 - telecommute),
      meaning_pref = ifelse(meaning_pref %in% c(0, 8, 9), NA, 5 - meaning_pref),
      flex_pref = ifelse(flex_pref %in% c(0, 8, 9), NA, 5 - flex_pref))

# Create Dummies for Meaning & Flexibility
data <- data %>%
    mutate(
      high_meaning = ifelse(meaning > 3, 1, 0),
      high_flex = ifelse(flexibility > 1, 1, 0),
      high_telecommute = ifelse(telecommute > 1, 1, 0),
      high_flex_pref = ifelse(flex_pref %in% c(4, 5), 1, 0),
      high_meaning_pref = ifelse(meaning_pref %in% c(4, 5), 1, 0)
    )
  
# Replace missing values with NA
data$high_meaning <- ifelse(is.na(data$meaning), NA, data$high_meaning)
data$high_flex <- ifelse(is.na(data$flexibility), NA, data$high_flex)
data$high_telecommute <- ifelse(is.na(data$telecommute), NA, data$high_telecommute)
data$high_flex_pref <- ifelse(is.na(data$flex_pref), NA, data$high_flex_pref)
data$high_meaning_pref <- ifelse(is.na(data$meaning_pref), NA, data$high_meaning_pref)
  
data <- data %>%
  select(woman, occupation, empst, country_s, country_f, sex, age, age2, marst, 
           children, married, education, hours, log_hours, level_hours, flexibility, 
           telecommute, meaning, flex_pref, meaning_pref, high_flex, high_flex_pref, 
           high_meaning, high_meaning_pref, high_telecommute, wage_d, hwage, wage_q, weight)
  
  clean_ISSP_data = na.omit(data)
  rm(data)